Set Up, Load, Clean Data

## load packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

## read in data
births = read.csv("data/Yr1116Birth.csv", na.strings = "9999")
deaths = read.csv("data/Yr1116Death.csv")
 
## rewrite NAs
births$SEX[which(births$SEX == 9)] = NA
births$CIGPN[which(births$CIGPN == 99)] = NA
births$CIGFN[which(births$CIGFN == 99)] = NA
births$CIGSN[which(births$CIGSN == 99)] = NA
births$CIGLN[which(births$CIGLN == 99)] = NA
births$PARITY[which(births$PARITY == 99)] = NA
births$PLUR[which(births$PLUR == 99)] = NA
births$GEST[which(births$GEST == 99)] = NA
births$MAGE[which(births$MAGE == 99)] = NA

Exploratory Data Analysis

Birthweight

summary(births$BWTG)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       4    2948    3310    3258    3640    6294     430
sd(births$BWTG, na.rm = TRUE)
## [1] 618.8703
ggplot(births, aes(x= BWTG))+
  geom_histogram() +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 430 rows containing non-finite values (stat_bin).

Birthweight is close to normally distrubted, with a slight left skew, centered around ~3300g with a standard deviation of 600g. There appear to be no large outliers in terms of birthweight. 430 birth weights are missing

Smoking

births = births %>% mutate(
  CIGPN_binary = CIGPN>0,
  CIGFN_binary = CIGFN>0,
  CIGSN_binary = CIGSN>0,
  CIGLN_binary = CIGLN>0
)

births = births %>%
  mutate(total_smoked = CIGFN+CIGSN+CIGLN) %>%
  mutate(smoked_during = total_smoked >0)

mean(births$smoked_during,na.rm=TRUE)
## [1] 0.100175
mean(births$CIGPN_binary, na.rm = TRUE)
## [1] 0.1340957
mean(births$CIGFN_binar, na.rm = TRUE)
## [1] 0.09676666
mean(births$CIGSN_binary,na.rm = TRUE)
## [1] 0.08199303
mean(births$CIGLN_binary, na.rm = TRUE)
## [1] 0.07788803
ggplot(births %>% filter(smoked_during), aes(x = total_smoked))+
  geom_histogram() +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

mean(births %>% filter(smoked_during) %>% select(total_smoked) %>% unlist())
## [1] 23.06795
ggplot(births, aes(x= smoked_during, y = BWTG)) +
  geom_boxplot()+
  theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).

lm_smoking_v_non = lm(data=births, BWTG~smoked_during)
summary(lm_smoking_v_non)
## 
## Call:
## lm(formula = BWTG ~ smoked_during, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3276.1  -304.1    40.2   375.9  3012.9 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3281.064      0.764 4294.52   <2e-16 ***
## smoked_duringTRUE -231.237      2.414  -95.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 614.5 on 718931 degrees of freedom
##   (2759 observations deleted due to missingness)
## Multiple R-squared:  0.0126, Adjusted R-squared:  0.0126 
## F-statistic:  9175 on 1 and 718931 DF,  p-value: < 2.2e-16
births = births %>%
  mutate(smoking_type = ifelse(smoked_during, 
      ifelse(CIGPN_binary, "before and during", "during only"),
      ifelse(CIGPN_binary, "before only", "none"))) %>%
  mutate(smoking_type = relevel(as.factor(smoking_type), ref = 4))


ggplot(births, aes(x= smoking_type, y = BWTG)) +
  geom_boxplot()+
  theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).

lm_smoking = lm(data=births, BWTG~smoking_type)
summary(lm_smoking)
## 
## Call:
## lm(formula = BWTG ~ smoking_type, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3276.6  -304.6    40.5   375.4  3012.4 
## 
## Coefficients:
##                                Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                   3281.6171     0.7802 4205.990  < 2e-16 ***
## smoking_typebefore and during -232.0954     2.4538  -94.586  < 2e-16 ***
## smoking_typebefore only        -13.4168     3.8483   -3.486  0.00049 ***
## smoking_typeduring only       -223.2977    13.0480  -17.114  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 614.5 on 718904 degrees of freedom
##   (2784 observations deleted due to missingness)
## Multiple R-squared:  0.01262,    Adjusted R-squared:  0.01262 
## F-statistic:  3063 on 3 and 718904 DF,  p-value: < 2.2e-16

Around 13% of women smoked in the three months leading up to pregnancy and around 10% of women at any point during their pregnancy. Among those who did smoke during pregnance, the average number of cigarettes smoked during pregnancy was 23. The birthweight of children of smokers was significantly lower than that of the children of nonsmokers, with an average difference of 231 grams. There is also a significant relationship between birthweight and smoking before pregnancy, even for those who did not smoke during pregnancy.

Parity

# Check the parity frequencies
summary(births$PARITY)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.465   3.000  25.000     860
births$PARITY = as.numeric(births$PARITY)
births = births %>% 
  mutate(PARITY_truncated = ifelse(
    PARITY > 4, "5+", PARITY)
  )
  
births$PARITY = as.factor(births$PARITY)
ggplot(data = births, mapping = aes(x = PARITY, y = BWTG)) +
  geom_boxplot() + xlab("Parity") + ylab("Birth weight (g)") + 
  ggtitle("NC Births, 2011-2016") + theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).

ggplot(births, aes(x = PARITY)) +
        stat_count()

# Model without any change, significant up to the low teens
parity_reg = lm(BWTG ~ PARITY, births)
summary(parity_reg)
## 
## Call:
## lm(formula = BWTG ~ PARITY, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3272.9  -303.2    50.9   382.5  3126.8 
## 
## Coefficients:
##             Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 3223.242      1.259 2559.590  < 2e-16 ***
## PARITY2       70.877      1.858   38.155  < 2e-16 ***
## PARITY3       63.680      2.124   29.976  < 2e-16 ***
## PARITY4       40.306      2.645   15.241  < 2e-16 ***
## PARITY5       11.815      3.451    3.424 0.000618 ***
## PARITY6      -11.166      4.751   -2.350 0.018757 *  
## PARITY7      -32.906      6.538   -5.033 4.84e-07 ***
## PARITY8      -76.858      9.209   -8.345  < 2e-16 ***
## PARITY9      -70.364     12.447   -5.653 1.58e-08 ***
## PARITY10     -85.904     17.290   -4.968 6.75e-07 ***
## PARITY11    -126.567     22.324   -5.670 1.43e-08 ***
## PARITY12     -41.074     30.094   -1.365 0.172306    
## PARITY13    -140.997     37.611   -3.749 0.000178 ***
## PARITY14    -157.146     55.261   -2.844 0.004459 ** 
## PARITY15     -20.675     62.728   -0.330 0.741706    
## PARITY16    -158.545     82.550   -1.921 0.054782 .  
## PARITY17     -64.515    107.530   -0.600 0.548530    
## PARITY18     -61.242    145.592   -0.421 0.674019    
## PARITY19      62.758    165.085    0.380 0.703829    
## PARITY20     250.258    195.329    1.281 0.200120    
## PARITY21     112.901    233.462    0.484 0.628673    
## PARITY22     -80.242    356.616   -0.225 0.821972    
## PARITY23    -530.242    617.674   -0.858 0.390645    
## PARITY25     162.758    218.384    0.745 0.456100    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 617.7 on 720622 degrees of freedom
##   (1046 observations deleted due to missingness)
## Multiple R-squared:  0.003304,   Adjusted R-squared:  0.003272 
## F-statistic: 103.9 on 23 and 720622 DF,  p-value: < 2.2e-16
# Rerun EDA with truncated data
births$PARITY_truncated = as.factor(births$PARITY_truncated)
ggplot(births, aes(x = PARITY_truncated)) +
        stat_count()

ggplot(data = births, mapping = aes(x = PARITY_truncated, y = BWTG)) +
  geom_boxplot() + xlab("Parity Truncated") + ylab("Birth weight (g)") + 
  ggtitle("NC Births, 2011-2016")
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).

parity_truncated_reg = lm(BWTG ~ PARITY_truncated, births)
summary(parity_truncated_reg)
## 
## Call:
## lm(formula = BWTG ~ PARITY_truncated, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3272.9  -303.2    48.7   383.1  3061.8 
## 
## Coefficients:
##                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)        3223.242      1.259 2559.251  < 2e-16 ***
## PARITY_truncated2    70.877      1.858   38.150  < 2e-16 ***
## PARITY_truncated3    63.680      2.125   29.972  < 2e-16 ***
## PARITY_truncated4    40.306      2.645   15.239  < 2e-16 ***
## PARITY_truncated5+  -11.939      2.589   -4.612 3.99e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 617.8 on 720641 degrees of freedom
##   (1046 observations deleted due to missingness)
## Multiple R-squared:  0.003014,   Adjusted R-squared:  0.003008 
## F-statistic: 544.6 on 4 and 720641 DF,  p-value: < 2.2e-16

Independent of other variables, we see a negative relationship between parity and birth weight past the first child. The frequency of parity decreases in an exponential fashion. A second variable was created that truncates parities of at least five to improve interprability and prevent overfitting. The quantity of missing data is relatively small.

Plurality

# Check the plurality frequencies
births$PLUR = as.numeric(births$PLUR)
births = births %>% 
  mutate(PLUR_truncated = ifelse(
    PLUR > 2, "3+", PLUR)
  )
births$PLUR = as.factor(births$PLUR)
ggplot(data = births, mapping = aes(x = PLUR, y = BWTG)) +
  geom_boxplot() + xlab("Plurality") + ylab("Birth weight (g)") + 
  ggtitle("NC Births, 2011-2016") + theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).

ggplot(births, aes(x = PLUR)) +
        stat_count()

plurality_reg = lm(BWTG ~ PLUR, births)
summary(plurality_reg)
## 
## Call:
## lm(formula = BWTG ~ PLUR, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3287.1  -302.1    28.9   364.9  3001.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3292.1374     0.7085 4646.93   <2e-16 ***
## PLUR2        -969.8739     3.8495 -251.95   <2e-16 ***
## PLUR3       -1632.6058    21.4519  -76.11   <2e-16 ***
## PLUR4       -1767.3874   132.1681  -13.37   <2e-16 ***
## PLUR5       -1899.7374   186.9125  -10.16   <2e-16 ***
## PLUR6       -2999.9707   241.3023  -12.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 591.1 on 721251 degrees of freedom
##   (435 observations deleted due to missingness)
## Multiple R-squared:  0.08785,    Adjusted R-squared:  0.08784 
## F-statistic: 1.389e+04 on 5 and 721251 DF,  p-value: < 2.2e-16
births$PLUR_truncated = as.factor(births$PLUR_truncated)
ggplot(data = births, mapping = aes(x = PLUR_truncated, y = BWTG)) +
  geom_boxplot() + xlab("Plurality Truncated") + ylab("Birth weight (g)") + 
  ggtitle("NC Births, 2011-2016") + theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).

ggplot(births, aes(x = PLUR_truncated)) +
        stat_count()

plurality_reg_truncated = lm(BWTG ~ PLUR_truncated, births)
summary(plurality_reg_truncated)
## 
## Call:
## lm(formula = BWTG ~ PLUR_truncated, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3287.1  -302.1    28.9   364.9  3001.9 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3292.1374     0.7085  4646.8   <2e-16 ***
## PLUR_truncated2   -969.8739     3.8496  -251.9   <2e-16 ***
## PLUR_truncated3+ -1649.6550    20.9622   -78.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 591.1 on 721254 degrees of freedom
##   (435 observations deleted due to missingness)
## Multiple R-squared:  0.0878, Adjusted R-squared:  0.0878 
## F-statistic: 3.471e+04 on 2 and 721254 DF,  p-value: < 2.2e-16

We see a strong non linear negative relationship between plurality and birth weight. The frequency of pluralities above two is extremely small, and we again see a proportionally small amount of missing data. A second variable was created that truncates pluralities of at least three to improve interprability and prevent overfitting

Gestation

summary(births$GEST)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    15.0    38.0    39.0    38.5    40.0    83.0     539
ggplot(data = births, mapping = aes(x = as.factor(GEST), y = BWTG)) +
  xlab("Gestation Period (weeks)") + 
  ylab("Birth weight (g)") + 
  geom_boxplot() +
  ggtitle("NC Births, 2011-2016") + 
  theme_minimal()
## Warning: Removed 430 rows containing non-finite values (stat_boxplot).

ggplot(births, aes(x = GEST)) +
        stat_count()
## Warning: Removed 539 rows containing non-finite values (stat_count).

gest_reg = lm(BWTG ~ GEST, births)
summary(gest_reg)
## 
## Call:
## lm(formula = BWTG ~ GEST, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9797.7  -299.6   -20.5   276.2  4752.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3898.2669     8.8696  -439.5   <2e-16 ***
## GEST          185.8433     0.2299   808.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 448.1 on 720801 degrees of freedom
##   (889 observations deleted due to missingness)
## Multiple R-squared:  0.4754, Adjusted R-squared:  0.4754 
## F-statistic: 6.532e+05 on 1 and 720801 DF,  p-value: < 2.2e-16
max_gest = max(births$GEST, na.rm=TRUE)
births = births[which(births$GEST != max_gest), ]

There appears to be a non linear positive relationship between gestation period and birth weight. The mean gestational period is approximately 38.5 weeks and the period with the highest median weight is 42 weeks. The frequency distribution is left skewed with the majority of babies having a gestational period between 38 and 40 weeks. There is some concern that more extreme gestational periods may lead to higher variance, and it should be noted that there is a chunk of data points with gestational periods of 17 to 21 weeks that have much higher than expected birth weights. There is one outlier at gestation period of 83. We will assume that this data point was recorded incorrectly and will drop the observation from the dataset.

Age of Mother

ggplot(data = births, mapping = aes(x = MAGE, y = BWTG)) +
  xlab("Age of Mother (years)") + 
  ylab("Birth weight (g)") + 
  geom_smooth(method='lm', na.rm = TRUE) + 
  geom_point() +
  ggtitle("NC Births, 2011-2016") + 
  theme_minimal()
## Warning: Removed 376 rows containing missing values (geom_point).

ggplot(births, aes(x = MAGE)) +
  stat_count() +
  theme_minimal()
## Warning: Removed 26 rows containing non-finite values (stat_count).

mage_reg = lm(BWTG ~ MAGE, births)
summary(mage_reg)
## 
## Call:
## lm(formula = BWTG ~ MAGE, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3330.1  -302.4    49.5   383.3  3053.7 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3065.1300     3.4962  876.70   <2e-16 ***
## MAGE           6.9228     0.1229   56.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 617.3 on 720774 degrees of freedom
##   (376 observations deleted due to missingness)
## Multiple R-squared:  0.004379,   Adjusted R-squared:  0.004378 
## F-statistic:  3170 on 1 and 720774 DF,  p-value: < 2.2e-16

Mother’s age seems to be fairly normally distributed with a mean of 27.8. There appears to be a positive relationship between the age of the mother and the birth weight. There is no evidence to suggest that the birth weight variance is not constant across the mother’s age.

Race of Mother

births$MRACER = as.factor(births$MRACER)
ggplot(data = births, mapping = aes(x = MRACER, y = BWTG)) +
  xlab("Race of Mother") + 
  ylab("Birth weight (g)") + 
  geom_boxplot() +
  ggtitle("NC Births, 2011-2016") + 
  theme_minimal()
## Warning: Removed 350 rows containing non-finite values (stat_boxplot).

bw_race_avgs = births %>%
  group_by(MRACER) %>%
  summarize(mweight = mean(BWTG, na.rm = TRUE), freq_percent = n()/nrow(births))
bw_race_avgs
## # A tibble: 9 x 3
##   MRACER mweight freq_percent
##   <fct>    <dbl>        <dbl>
## 1 0        3292.     0.117   
## 2 1        3335.     0.586   
## 3 2        3070.     0.243   
## 4 3        3197.     0.0141  
## 5 4        3285.     0.00463 
## 6 5        3193.     0.000807
## 7 6        3148      0.000132
## 8 7        3218.     0.00294 
## 9 8        3169.     0.0322
ggplot(births, aes(x = MRACER)) +
        stat_count()

mage_reg = lm(BWTG ~ MRACER, births)
summary(mage_reg)
## 
## Call:
## lm(formula = BWTG ~ MRACER, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3331.2  -301.2    48.3   378.8  3100.3 
## 
## Coefficients:
##             Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 3291.761      2.095 1571.537  < 2e-16 ***
## MRACER1       43.452      2.294   18.938  < 2e-16 ***
## MRACER2     -222.103      2.550  -87.091  < 2e-16 ***
## MRACER3      -95.067      6.395  -14.867  < 2e-16 ***
## MRACER4       -7.259     10.737   -0.676 0.498996    
## MRACER5      -98.311     25.306   -3.885 0.000102 ***
## MRACER6     -143.761     62.455   -2.302 0.021345 *  
## MRACER7      -73.905     13.372   -5.527 3.26e-08 ***
## MRACER8     -122.548      4.510  -27.172  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 608.4 on 720793 degrees of freedom
##   (350 observations deleted due to missingness)
## Multiple R-squared:  0.0328, Adjusted R-squared:  0.03279 
## F-statistic:  3055 on 8 and 720793 DF,  p-value: < 2.2e-16

0 - non-white,, 1 = white, 2 black, 3 indian, 8 other asian

There are significant differences between the average birth weights of mother’s of different races. We see that mother’s that self identified as white have the largest mean baby weight at 3.33 kg, while black mother’s have the lowest mean baby weight at only 3.07 kg. 58 percent of mother’s identify as white, 24 percent identify as black, 12 percent identify as non-white, and 3 percent identify as other asian.

County / Socioeconomic Status

#calculate infant mortality by county, to use as a proxy for socioconomic status
deaths_by_county = deaths %>%
  group_by(cores) %>%
  summarize(n_deaths = n()) %>%
  rename(CORES=cores)

births_by_county = births %>%
  group_by(CORES) %>%
  summarize(n_births = n())

infant_mortality = merge(deaths_by_county, births_by_county, by = "CORES") %>%
  mutate(mortality = n_deaths/n_births*100) %>% #note: mortality rate is as percentage
  select(-n_births, -n_deaths)

births = merge(births, infant_mortality, by = "CORES")

summary(births$mortality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1167  0.6152  0.7013  0.7208  0.7965  1.7606
ggplot(births, aes(x= mortality))+
  geom_histogram() +
  xlab("Mortality Rate (%)")+
  theme_minimal() +
  ggtitle("Histogram of Infant Mortality Rate by County")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

bwtg_vs_mortality = lm(data = births, BWTG~mortality)
summary(bwtg_vs_mortality)
## 
## Call:
## lm(formula = BWTG ~ mortality, data = births)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3287.0  -303.7    45.5   382.8  3026.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3370.715      3.186 1057.90   <2e-16 ***
## mortality   -156.823      4.304  -36.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 618.1 on 720800 degrees of freedom
##   (350 observations deleted due to missingness)
## Multiple R-squared:  0.001839,   Adjusted R-squared:  0.001837 
## F-statistic:  1328 on 1 and 720800 DF,  p-value: < 2.2e-16
ggplot(births, aes(x = mortality, y = BWTG))+
  geom_point() +
  geom_abline(slope = bwtg_vs_mortality$coefficients[[2]], intercept = bwtg_vs_mortality$coefficients[[1]])+
  theme_minimal()
## Warning: Removed 350 rows containing missing values (geom_point).

We chose to use infant mortality rate of birth county as a proxy for socioeconomic status, calculated as number of deaths before the age of 1 divided by total number of births in a county. The median county in the data had a infant mortality rate of 0.7%, with the range of infant mortality rates in our dataset ranging from 0.12% to 1.76%. Infant mortality rate of birth county and birth weight appear to have a weak negative linear relationship, and a 1 percentage point increase in infant mortality rate is associated with a 157g decrease in expected birth weight.

Build Model

births_excl = na.omit(births)
model1 = lm(data = births_excl, BWTG ~ GEST + PARITY_truncated + PLUR + smoking_type + MAGE + MRACER + mortality, na.action = "na.exclude")
summary(model1)
## 
## Call:
## lm(formula = BWTG ~ GEST + PARITY_truncated + PLUR + smoking_type + 
##     MAGE + MRACER + mortality, data = births_excl, na.action = "na.exclude")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3406.6  -282.9   -19.5   260.9  4754.9 
## 
## Coefficients:
##                                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                   -3691.8459     9.9298 -371.795  < 2e-16 ***
## GEST                            175.2110     0.2336  749.912  < 2e-16 ***
## PARITY_truncated2                94.7003     1.3162   71.951  < 2e-16 ***
## PARITY_truncated3               113.4122     1.5380   73.742  < 2e-16 ***
## PARITY_truncated4               117.1798     1.9259   60.844  < 2e-16 ***
## PARITY_truncated5+              117.7761     1.9533   60.297  < 2e-16 ***
## PLUR2                          -360.8848     2.9313 -123.116  < 2e-16 ***
## PLUR3                          -459.6481    15.7032  -29.271  < 2e-16 ***
## PLUR4                          -449.5572    95.9190   -4.687 2.78e-06 ***
## PLUR5                          -249.7235   191.8140   -1.302    0.193    
## PLUR6                           -39.7109   175.1356   -0.227    0.821    
## smoking_typebefore and during  -204.8179     1.7697 -115.734  < 2e-16 ***
## smoking_typebefore only         -11.4762     2.7026   -4.246 2.17e-05 ***
## smoking_typeduring only        -165.3536     9.1293  -18.112  < 2e-16 ***
## MAGE                              4.5897     0.0964   47.610  < 2e-16 ***
## MRACER1                          83.5047     1.6596   50.318  < 2e-16 ***
## MRACER2                         -99.3192     1.8206  -54.552  < 2e-16 ***
## MRACER3                          -4.8482     4.5756   -1.060    0.289    
## MRACER4                         -33.5537     7.6016   -4.414 1.01e-05 ***
## MRACER5                        -114.3439    17.9405   -6.373 1.85e-10 ***
## MRACER6                          24.2038    44.0274    0.550    0.582    
## MRACER7                         -14.8690     9.4582   -1.572    0.116    
## MRACER8                        -102.9010     3.2079  -32.077  < 2e-16 ***
## mortality                        23.2031     3.0989    7.487 7.03e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 428.9 on 717845 degrees of freedom
## Multiple R-squared:  0.5184, Adjusted R-squared:  0.5184 
## F-statistic: 3.36e+04 on 23 and 717845 DF,  p-value: < 2.2e-16
plot(model1$fitted.values, model1$residuals)

plot(births_excl$GEST, model1$residuals)

plot(births_excl$PLUR, model1$residuals)

plot(births_excl$PARITY_truncated, model1$residuals)

plot(births_excl$smoking_type, model1$residuals)

plot(births_excl$MRACER, model1$residuals)

plot(births_excl$MAGE, model1$residuals)

births_excl = births_excl %>%
  mutate(GEST2 = GEST^2, GEST3 = GEST ^ 3)
model2 = lm(data = births_excl, BWTG ~ GEST + GEST2 + PARITY_truncated + PLUR + smoking_type + MAGE + MRACER + mortality, na.action = "na.exclude")
summary(model2)
## 
## Call:
## lm(formula = BWTG ~ GEST + GEST2 + PARITY_truncated + PLUR + 
##     smoking_type + MAGE + MRACER + mortality, data = births_excl, 
##     na.action = "na.exclude")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3412.1  -283.0   -19.5   260.7  4710.1 
## 
## Coefficients:
##                                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                   -2.941e+03  4.049e+01  -72.632  < 2e-16 ***
## GEST                           1.309e+02  2.328e+00   56.214  < 2e-16 ***
## GEST2                          6.406e-01  3.348e-02   19.136  < 2e-16 ***
## PARITY_truncated2              9.592e+01  1.317e+00   72.814  < 2e-16 ***
## PARITY_truncated3              1.149e+02  1.540e+00   74.645  < 2e-16 ***
## PARITY_truncated4              1.190e+02  1.928e+00   61.733  < 2e-16 ***
## PARITY_truncated5+             1.198e+02  1.956e+00   61.251  < 2e-16 ***
## PLUR2                         -3.558e+02  2.943e+00 -120.897  < 2e-16 ***
## PLUR3                         -4.603e+02  1.570e+01  -29.319  < 2e-16 ***
## PLUR4                         -4.530e+02  9.589e+01   -4.724 2.31e-06 ***
## PLUR5                         -2.735e+02  1.918e+02   -1.426    0.154    
## PLUR6                         -1.452e+02  1.752e+02   -0.829    0.407    
## smoking_typebefore and during -2.041e+02  1.770e+00 -115.319  < 2e-16 ***
## smoking_typebefore only       -1.152e+01  2.702e+00   -4.264 2.01e-05 ***
## smoking_typeduring only       -1.651e+02  9.127e+00  -18.087  < 2e-16 ***
## MAGE                           4.612e+00  9.638e-02   47.846  < 2e-16 ***
## MRACER1                        8.372e+01  1.659e+00   50.458  < 2e-16 ***
## MRACER2                       -9.902e+01  1.820e+00  -54.401  < 2e-16 ***
## MRACER3                       -5.140e+00  4.574e+00   -1.124    0.261    
## MRACER4                       -3.341e+01  7.600e+00   -4.396 1.10e-05 ***
## MRACER5                       -1.135e+02  1.794e+01   -6.330 2.46e-10 ***
## MRACER6                        2.395e+01  4.402e+01    0.544    0.586    
## MRACER7                       -1.328e+01  9.456e+00   -1.404    0.160    
## MRACER8                       -1.020e+02  3.207e+00  -31.797  < 2e-16 ***
## mortality                      2.391e+01  3.098e+00    7.717 1.19e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 428.8 on 717844 degrees of freedom
## Multiple R-squared:  0.5187, Adjusted R-squared:  0.5186 
## F-statistic: 3.223e+04 on 24 and 717844 DF,  p-value: < 2.2e-16
plot(model2$fitted.values, model2$residuals)

plot(births_excl$GEST, model2$residuals)

plot(births_excl$PLUR, model2$residuals)

plot(births_excl$PARITY_truncated, model2$residuals)

plot(births_excl$smoking_type, model2$residuals)

plot(births_excl$MRACER, model2$residuals)

plot(births_excl$MAGE, model2$residuals)

model3 = lm(data = births_excl, BWTG ~ GEST + GEST2 + GEST3 + PARITY_truncated + PLUR + smoking_type + MAGE + MRACER + mortality, na.action = "na.exclude")
summary(model3)
## 
## Call:
## lm(formula = BWTG ~ GEST + GEST2 + GEST3 + PARITY_truncated + 
##     PLUR + smoking_type + MAGE + MRACER + mortality, data = births_excl, 
##     na.action = "na.exclude")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3396.9  -280.5   -18.7   258.0  4875.2 
## 
## Coefficients:
##                                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                    1.702e+04  1.867e+02   91.151  < 2e-16 ***
## GEST                          -1.785e+03  1.766e+01 -101.111  < 2e-16 ***
## GEST2                          6.015e+01  5.447e-01  110.436  < 2e-16 ***
## GEST3                         -6.020e-01  5.500e-03 -109.463  < 2e-16 ***
## PARITY_truncated2              8.814e+01  1.308e+00   67.364  < 2e-16 ***
## PARITY_truncated3              1.055e+02  1.529e+00   68.985  < 2e-16 ***
## PARITY_truncated4              1.100e+02  1.914e+00   57.455  < 2e-16 ***
## PARITY_truncated5+             1.118e+02  1.941e+00   57.628  < 2e-16 ***
## PLUR2                         -3.303e+02  2.928e+00 -112.805  < 2e-16 ***
## PLUR3                         -3.509e+02  1.560e+01  -22.488  < 2e-16 ***
## PLUR4                         -2.910e+02  9.512e+01   -3.060  0.00222 ** 
## PLUR5                         -1.592e+01  1.902e+02   -0.084  0.93330    
## PLUR6                         -5.172e+02  1.738e+02   -2.976  0.00292 ** 
## smoking_typebefore and during -2.033e+02  1.755e+00 -115.846  < 2e-16 ***
## smoking_typebefore only       -1.131e+01  2.680e+00   -4.221 2.43e-05 ***
## smoking_typeduring only       -1.651e+02  9.052e+00  -18.240  < 2e-16 ***
## MAGE                           4.592e+00  9.559e-02   48.041  < 2e-16 ***
## MRACER1                        8.254e+01  1.646e+00   50.158  < 2e-16 ***
## MRACER2                       -9.966e+01  1.805e+00  -55.205  < 2e-16 ***
## MRACER3                       -2.099e+00  4.537e+00   -0.463  0.64363    
## MRACER4                       -3.828e+01  7.537e+00   -5.079 3.80e-07 ***
## MRACER5                       -1.184e+02  1.779e+01   -6.658 2.78e-11 ***
## MRACER6                        2.413e+01  4.365e+01    0.553  0.58043    
## MRACER7                       -1.951e+01  9.378e+00   -2.080  0.03750 *  
## MRACER8                       -1.070e+02  3.181e+00  -33.620  < 2e-16 ***
## mortality                      1.952e+01  3.073e+00    6.353 2.12e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 425.2 on 717843 degrees of freedom
## Multiple R-squared:  0.5266, Adjusted R-squared:  0.5265 
## F-statistic: 3.194e+04 on 25 and 717843 DF,  p-value: < 2.2e-16
plot(model3$fitted.values, model3$residuals)

plot(births_excl$GEST, model3$residuals)

plot(births_excl$PLUR, model3$residuals)

plot(births_excl$PARITY_truncated, model3$residuals)

plot(births_excl$smoking_type, model3$residuals)

plot(births_excl$MRACER, model3$residuals)

plot(births_excl$MAGE, model3$residuals)

births_excl = births_excl %>%
  mutate(GEST4 = GEST^4)
model4 = lm(data = births_excl, BWTG ~ GEST + GEST2 + GEST3 + GEST4 + PARITY_truncated + PLUR_truncated + smoking_type + MAGE + MRACER + mortality, na.action = "na.exclude")
summary(model4)
## 
## Call:
## lm(formula = BWTG ~ GEST + GEST2 + GEST3 + GEST4 + PARITY_truncated + 
##     PLUR_truncated + smoking_type + MAGE + MRACER + mortality, 
##     data = births_excl, na.action = "na.exclude")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3398.1  -280.3   -19.4   257.3  4742.4 
## 
## Coefficients:
##                                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                   -1.601e+04  7.646e+02  -20.934  < 2e-16 ***
## GEST                           2.640e+03  1.009e+02   26.157  < 2e-16 ***
## GEST2                         -1.565e+02  4.896e+00  -31.968  < 2e-16 ***
## GEST3                          4.009e+00  1.037e-01   38.660  < 2e-16 ***
## GEST4                         -3.610e-02  8.109e-04  -44.525  < 2e-16 ***
## PARITY_truncated2              8.561e+01  1.308e+00   65.458  < 2e-16 ***
## PARITY_truncated3              1.028e+02  1.528e+00   67.270  < 2e-16 ***
## PARITY_truncated4              1.076e+02  1.912e+00   56.276  < 2e-16 ***
## PARITY_truncated5+             1.104e+02  1.938e+00   56.944  < 2e-16 ***
## PLUR_truncated2               -3.153e+02  2.943e+00 -107.137  < 2e-16 ***
## PLUR_truncated3+              -3.296e+02  1.528e+01  -21.565  < 2e-16 ***
## smoking_typebefore and during -2.024e+02  1.753e+00 -115.467  < 2e-16 ***
## smoking_typebefore only       -1.114e+01  2.676e+00   -4.161 3.16e-05 ***
## smoking_typeduring only       -1.647e+02  9.039e+00  -18.225  < 2e-16 ***
## MAGE                           4.592e+00  9.546e-02   48.105  < 2e-16 ***
## MRACER1                        8.213e+01  1.643e+00   49.981  < 2e-16 ***
## MRACER2                       -9.986e+01  1.803e+00  -55.396  < 2e-16 ***
## MRACER3                       -1.769e+00  4.531e+00   -0.390   0.6963    
## MRACER4                       -4.031e+01  7.527e+00   -5.355 8.57e-08 ***
## MRACER5                       -1.203e+02  1.776e+01   -6.774 1.25e-11 ***
## MRACER6                        2.233e+01  4.359e+01    0.512   0.6085    
## MRACER7                       -2.029e+01  9.365e+00   -2.167   0.0302 *  
## MRACER8                       -1.081e+02  3.177e+00  -34.040  < 2e-16 ***
## mortality                      1.813e+01  3.069e+00    5.908 3.47e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 424.6 on 717845 degrees of freedom
## Multiple R-squared:  0.5279, Adjusted R-squared:  0.5278 
## F-statistic: 3.489e+04 on 23 and 717845 DF,  p-value: < 2.2e-16
plot(model4$fitted.values, model4$residuals)

plot(births_excl$GEST, model4$residuals)

plot(births_excl$PLUR_truncated, model4$residuals)

plot(births_excl$PARITY_truncated, model4$residuals)

plot(births_excl$smoking_type, model4$residuals)

plot(births_excl$MRACER, model4$residuals)

plot(births_excl$MAGE, model4$residuals)